/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full.dta

Outputs: 	pdb_full_papers.dta

Purpose: 	Generate a dataset unique at the paper level
			Generate new variables specific to this project

*******************************************************************************/

clear all
use "${data_built}pdb_full.dta", clear


* Entity ID is expected to be missing only for single-entity structures.
* Verify that, then assign ID 1 to those rows.
* missing() is used rather than a comparison against . so that extended
* missing codes (.a-.z) are checked and filled as well.
assert numEntities == 1 if missing(entityId)
replace entityId = 1 if missing(entityId)

* Confirm the data are uniquely identified by structure and entity
isid structureId entityId

* Fill in missing publication years if pub year is missing but PMID is present.
* The release year is used as a stand-in (assumes releaseDate is a Stata
* daily date, as required by year() -- TODO confirm).
* missing() also treats extended missing codes (.a-.z) as missing, which a
* comparison against . does not.
replace publicationYear = year(releaseDate) if missing(publicationYear) 	///
	& !missing(pubmedId)
	
* Flag the relevant paper-level variables and keep these
* (no trailing line continuation after the last entry, so the macro
* definition cannot swallow whatever statement follows it)
local paperVars pubmedId publicationYear title journalName volumeId  		///
	firstPage lastPage paperAuthor* firstPaperAuthor lastPaperAuthor  		///
	lastAuthorId numPaperAuthors stk_*

keep `paperVars'

* Rows without a PMID cannot be tied to a paper. missing() also removes
* extended missing codes (.a-.z), which would otherwise survive and make
* the isid check below fail.
drop if missing(pubmedId)

* Some text vars are inconsistent (e.g. Stein, C. and Stein, CM),
* so drop duplicates based on PubmedID alone. The stable sort keeps rows
* with the same PMID in their incoming order, so the surviving row is the
* first one encountered for each PMID.
sort pubmedId, stable
duplicates drop pubmedId, force

* Confirm one row per paper, then put the paper variables in a fixed order
isid pubmedId
order `paperVars'
sort pubmedId

* Generate new variables

	* Inverse hyperbolic sine of citations, one variable per
	* stk_cites_nslf suffix (3, 5, 10):
	*     asinh(x) = ln(x + sqrt(1 + x^2))
	foreach sfx in 3 5 10 {
		local cites stk_cites_nslf`sfx'
		gen asinhCites`sfx' = ln(`cites' + sqrt(1 + `cites'^2))
	}

* Write out the paper-level dataset
save "${data_built}pdb_full_papers.dta", replace


